import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Apply seaborn's "darkgrid" style to the plots drawn below.
sns.set(style='darkgrid')
# Load the wine-quality CSV (relative path) into a DataFrame.
df = pd.read_csv('winequality-red.csv')
# Preview the first rows.
df.head()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
# List the column labels.
df.columns
Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol', 'quality'],
dtype='object')
# (rows, columns) of the frame.
df.shape
(1599, 12)
# Count missing values in each column.
df.isna().sum()
fixed acidity 0 volatile acidity 0 citric acid 0 residual sugar 0 chlorides 0 free sulfur dioxide 0 total sulfur dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
# Print per-column dtype and non-null count, plus memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1599 entries, 0 to 1598 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fixed acidity 1599 non-null float64 1 volatile acidity 1599 non-null float64 2 citric acid 1599 non-null float64 3 residual sugar 1599 non-null float64 4 chlorides 1599 non-null float64 5 free sulfur dioxide 1599 non-null float64 6 total sulfur dioxide 1599 non-null float64 7 density 1599 non-null float64 8 pH 1599 non-null float64 9 sulphates 1599 non-null float64 10 alcohol 1599 non-null float64 11 quality 1599 non-null int64 dtypes: float64(11), int64(1) memory usage: 150.0 KB
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 | 1599.000000 |
| mean | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| std | 1.741096 | 0.179060 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
| 25% | 7.100000 | 0.390000 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 22.000000 | 0.995600 | 3.210000 | 0.550000 | 9.500000 | 5.000000 |
| 50% | 7.900000 | 0.520000 | 0.260000 | 2.200000 | 0.079000 | 14.000000 | 38.000000 | 0.996750 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
| 75% | 9.200000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 62.000000 | 0.997835 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
| max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 72.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Pairwise correlation between all columns (pandas' default method, Pearson).
corr = df.corr()
# Render the matrix as a table shaded with the 'coolwarm' colour map.
corr.style.background_gradient(cmap='coolwarm')
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| fixed acidity | 1.000000 | -0.256131 | 0.671703 | 0.114777 | 0.093705 | -0.153794 | -0.113181 | 0.668047 | -0.682978 | 0.183006 | -0.061668 | 0.124052 |
| volatile acidity | -0.256131 | 1.000000 | -0.552496 | 0.001918 | 0.061298 | -0.010504 | 0.076470 | 0.022026 | 0.234937 | -0.260987 | -0.202288 | -0.390558 |
| citric acid | 0.671703 | -0.552496 | 1.000000 | 0.143577 | 0.203823 | -0.060978 | 0.035533 | 0.364947 | -0.541904 | 0.312770 | 0.109903 | 0.226373 |
| residual sugar | 0.114777 | 0.001918 | 0.143577 | 1.000000 | 0.055610 | 0.187049 | 0.203028 | 0.355283 | -0.085652 | 0.005527 | 0.042075 | 0.013732 |
| chlorides | 0.093705 | 0.061298 | 0.203823 | 0.055610 | 1.000000 | 0.005562 | 0.047400 | 0.200632 | -0.265026 | 0.371260 | -0.221141 | -0.128907 |
| free sulfur dioxide | -0.153794 | -0.010504 | -0.060978 | 0.187049 | 0.005562 | 1.000000 | 0.667666 | -0.021946 | 0.070377 | 0.051658 | -0.069408 | -0.050656 |
| total sulfur dioxide | -0.113181 | 0.076470 | 0.035533 | 0.203028 | 0.047400 | 0.667666 | 1.000000 | 0.071269 | -0.066495 | 0.042947 | -0.205654 | -0.185100 |
| density | 0.668047 | 0.022026 | 0.364947 | 0.355283 | 0.200632 | -0.021946 | 0.071269 | 1.000000 | -0.341699 | 0.148506 | -0.496180 | -0.174919 |
| pH | -0.682978 | 0.234937 | -0.541904 | -0.085652 | -0.265026 | 0.070377 | -0.066495 | -0.341699 | 1.000000 | -0.196648 | 0.205633 | -0.057731 |
| sulphates | 0.183006 | -0.260987 | 0.312770 | 0.005527 | 0.371260 | 0.051658 | 0.042947 | 0.148506 | -0.196648 | 1.000000 | 0.093595 | 0.251397 |
| alcohol | -0.061668 | -0.202288 | 0.109903 | 0.042075 | -0.221141 | -0.069408 | -0.205654 | -0.496180 | 0.205633 | 0.093595 | 1.000000 | 0.476166 |
| quality | 0.124052 | -0.390558 | 0.226373 | 0.013732 | -0.128907 | -0.050656 | -0.185100 | -0.174919 | -0.057731 | 0.251397 | 0.476166 | 1.000000 |
pH needs to be removed because it is not correlated with the target variable (quality) and is highly correlated with other features, such as fixed acidity and citric acid.
# Columns kept for the reduced correlation view; 'fixed acidity',
# 'free sulfur dioxide' and 'pH' are left out.
# NOTE(review): 'quality' (the target, also stored in y1) is part of x1 so that
# it shows up in the correlation table below; drop it from x1 before using x1
# as model features, otherwise the target leaks into the inputs.
selected_columns = [
    'volatile acidity',
    'residual sugar',
    'chlorides',
    'total sulfur dioxide',
    'density',
    'citric acid',
    'sulphates',
    'alcohol',
    'quality',
]
x1 = df[selected_columns]
y1 = df['quality']

# Correlation table restricted to the selected columns.
corr = x1.corr()
corr.style.background_gradient(cmap='coolwarm')
| volatile acidity | residual sugar | chlorides | total sulfur dioxide | density | citric acid | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|
| volatile acidity | 1.000000 | 0.001918 | 0.061298 | 0.076470 | 0.022026 | -0.552496 | -0.260987 | -0.202288 | -0.390558 |
| residual sugar | 0.001918 | 1.000000 | 0.055610 | 0.203028 | 0.355283 | 0.143577 | 0.005527 | 0.042075 | 0.013732 |
| chlorides | 0.061298 | 0.055610 | 1.000000 | 0.047400 | 0.200632 | 0.203823 | 0.371260 | -0.221141 | -0.128907 |
| total sulfur dioxide | 0.076470 | 0.203028 | 0.047400 | 1.000000 | 0.071269 | 0.035533 | 0.042947 | -0.205654 | -0.185100 |
| density | 0.022026 | 0.355283 | 0.200632 | 0.071269 | 1.000000 | 0.364947 | 0.148506 | -0.496180 | -0.174919 |
| citric acid | -0.552496 | 0.143577 | 0.203823 | 0.035533 | 0.364947 | 1.000000 | 0.312770 | 0.109903 | 0.226373 |
| sulphates | -0.260987 | 0.005527 | 0.371260 | 0.042947 | 0.148506 | 0.312770 | 1.000000 | 0.093595 | 0.251397 |
| alcohol | -0.202288 | 0.042075 | -0.221141 | -0.205654 | -0.496180 | 0.109903 | 0.093595 | 1.000000 | 0.476166 |
| quality | -0.390558 | 0.013732 | -0.128907 | -0.185100 | -0.174919 | 0.226373 | 0.251397 | 0.476166 | 1.000000 |
# Repeat of the reduced correlation table: x1 has not been modified since it
# was built, so this yields the same matrix again.
corr = x1.corr()
corr.style.background_gradient(cmap='coolwarm')
| volatile acidity | residual sugar | chlorides | total sulfur dioxide | density | citric acid | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|
| volatile acidity | 1.000000 | 0.001918 | 0.061298 | 0.076470 | 0.022026 | -0.552496 | -0.260987 | -0.202288 | -0.390558 |
| residual sugar | 0.001918 | 1.000000 | 0.055610 | 0.203028 | 0.355283 | 0.143577 | 0.005527 | 0.042075 | 0.013732 |
| chlorides | 0.061298 | 0.055610 | 1.000000 | 0.047400 | 0.200632 | 0.203823 | 0.371260 | -0.221141 | -0.128907 |
| total sulfur dioxide | 0.076470 | 0.203028 | 0.047400 | 1.000000 | 0.071269 | 0.035533 | 0.042947 | -0.205654 | -0.185100 |
| density | 0.022026 | 0.355283 | 0.200632 | 0.071269 | 1.000000 | 0.364947 | 0.148506 | -0.496180 | -0.174919 |
| citric acid | -0.552496 | 0.143577 | 0.203823 | 0.035533 | 0.364947 | 1.000000 | 0.312770 | 0.109903 | 0.226373 |
| sulphates | -0.260987 | 0.005527 | 0.371260 | 0.042947 | 0.148506 | 0.312770 | 1.000000 | 0.093595 | 0.251397 |
| alcohol | -0.202288 | 0.042075 | -0.221141 | -0.205654 | -0.496180 | 0.109903 | 0.093595 | 1.000000 | 0.476166 |
| quality | -0.390558 | 0.013732 | -0.128907 | -0.185100 | -0.174919 | 0.226373 | 0.251397 | 0.476166 | 1.000000 |
# Repeat of the reduced correlation table: x1 has not been modified since it
# was built, so this yields the same matrix again.
corr = x1.corr()
corr.style.background_gradient(cmap='coolwarm')
| volatile acidity | residual sugar | chlorides | total sulfur dioxide | density | citric acid | sulphates | alcohol | quality | |
|---|---|---|---|---|---|---|---|---|---|
| volatile acidity | 1.000000 | 0.001918 | 0.061298 | 0.076470 | 0.022026 | -0.552496 | -0.260987 | -0.202288 | -0.390558 |
| residual sugar | 0.001918 | 1.000000 | 0.055610 | 0.203028 | 0.355283 | 0.143577 | 0.005527 | 0.042075 | 0.013732 |
| chlorides | 0.061298 | 0.055610 | 1.000000 | 0.047400 | 0.200632 | 0.203823 | 0.371260 | -0.221141 | -0.128907 |
| total sulfur dioxide | 0.076470 | 0.203028 | 0.047400 | 1.000000 | 0.071269 | 0.035533 | 0.042947 | -0.205654 | -0.185100 |
| density | 0.022026 | 0.355283 | 0.200632 | 0.071269 | 1.000000 | 0.364947 | 0.148506 | -0.496180 | -0.174919 |
| citric acid | -0.552496 | 0.143577 | 0.203823 | 0.035533 | 0.364947 | 1.000000 | 0.312770 | 0.109903 | 0.226373 |
| sulphates | -0.260987 | 0.005527 | 0.371260 | 0.042947 | 0.148506 | 0.312770 | 1.000000 | 0.093595 | 0.251397 |
| alcohol | -0.202288 | 0.042075 | -0.221141 | -0.205654 | -0.496180 | 0.109903 | 0.093595 | 1.000000 | 0.476166 |
| quality | -0.390558 | 0.013732 | -0.128907 | -0.185100 | -0.174919 | 0.226373 | 0.251397 | 0.476166 | 1.000000 |
# Scatter plot of density against fixed acidity.
sns.relplot(x="fixed acidity", y="density",data=df)
<seaborn.axisgrid.FacetGrid at 0x2a592d09940>
# Density against volatile acidity; marker size (range 20-100) and colour both
# encode the quality score.
sns.relplot(x="volatile acidity", y="density",size="quality", sizes=(20,100), hue="quality",data=df)
<seaborn.axisgrid.FacetGrid at 0x2a593129fa0>
# pH against citric acid, coloured by alcohol, with one panel (column) per
# quality score.
sns.relplot(x = "citric acid", y = "pH", hue = "alcohol", col = "quality", data = df);
# Total fixed acidity per quality score, with 'quality' restored as a column.
# NOTE(review): a sum grows with the number of wines in each quality group, so
# the groups are not directly comparable; .mean() may be what is wanted - confirm.
df.groupby('quality')['fixed acidity'].sum().reset_index()
| quality | fixed acidity | |
|---|---|---|
| 0 | 3 | 83.6 |
| 1 | 4 | 412.3 |
| 2 | 5 | 5561.9 |
| 3 | 6 | 5325.5 |
| 4 | 7 | 1765.6 |
| 5 | 8 | 154.2 |
# Total volatile acidity per quality score, with 'quality' restored as a column.
df.groupby('quality')['volatile acidity'].sum().reset_index()
| quality | volatile acidity | |
|---|---|---|
| 0 | 3 | 8.845 |
| 1 | 4 | 36.780 |
| 2 | 5 | 392.965 |
| 3 | 6 | 317.395 |
| 4 | 7 | 80.380 |
| 5 | 8 | 7.620 |
# Total citric acid per quality score, with 'quality' restored as a column.
df.groupby('quality')['citric acid'].sum().reset_index()
| quality | citric acid | |
|---|---|---|
| 0 | 3 | 1.71 |
| 1 | 4 | 9.23 |
| 2 | 5 | 165.95 |
| 3 | 6 | 174.70 |
| 4 | 7 | 74.66 |
| 5 | 8 | 7.04 |
# Total residual sugar per quality score, with 'quality' restored as a column.
df.groupby('quality')['residual sugar'].sum().reset_index()
| quality | residual sugar | |
|---|---|---|
| 0 | 3 | 26.35 |
| 1 | 4 | 142.80 |
| 2 | 5 | 1722.15 |
| 3 | 6 | 1580.45 |
| 4 | 7 | 541.40 |
| 5 | 8 | 46.40 |
# Total chlorides per quality score, with 'quality' restored as a column.
df.groupby('quality')['chlorides'].sum().reset_index()
| quality | chlorides | |
|---|---|---|
| 0 | 3 | 1.225 |
| 1 | 4 | 4.806 |
| 2 | 5 | 63.153 |
| 3 | 6 | 54.202 |
| 4 | 7 | 15.241 |
| 5 | 8 | 1.232 |
# Total free sulfur dioxide per quality score, with 'quality' restored as a column.
df.groupby('quality')['free sulfur dioxide'].sum().reset_index()
| quality | free sulfur dioxide | |
|---|---|---|
| 0 | 3 | 110.0 |
| 1 | 4 | 650.0 |
| 2 | 5 | 11566.0 |
| 3 | 6 | 10024.0 |
| 4 | 7 | 2795.0 |
| 5 | 8 | 239.0 |
# Total of the 'total sulfur dioxide' column per quality score, with 'quality'
# restored as a column.
df.groupby('quality')['total sulfur dioxide'].sum().reset_index()
| quality | total sulfur dioxide | |
|---|---|---|
| 0 | 3 | 249.0 |
| 1 | 4 | 1921.0 |
| 2 | 5 | 38486.0 |
| 3 | 6 | 26075.0 |
| 4 | 7 | 6969.0 |
| 5 | 8 | 602.0 |
# Total density per quality score, with 'quality' restored as a column.
df.groupby('quality')['density'].sum().reset_index()
| quality | density | |
|---|---|---|
| 0 | 3 | 9.97464 |
| 1 | 4 | 52.81675 |
| 2 | 5 | 679.02757 |
| 3 | 6 | 635.84041 |
| 4 | 7 | 198.22475 |
| 5 | 8 | 17.91382 |
# Total pH per quality score, with 'quality' restored as a column.
df.groupby('quality')['pH'].sum().reset_index()
| quality | pH | |
|---|---|---|
| 0 | 3 | 33.98 |
| 1 | 4 | 179.22 |
| 2 | 5 | 2250.67 |
| 3 | 6 | 2116.93 |
| 4 | 7 | 654.86 |
| 5 | 8 | 58.81 |
# Total sulphates per quality score, with 'quality' restored as a column.
df.groupby('quality')['sulphates'].sum().reset_index()
| quality | sulphates | |
|---|---|---|
| 0 | 3 | 5.70 |
| 1 | 4 | 31.61 |
| 2 | 5 | 422.88 |
| 3 | 6 | 430.86 |
| 4 | 7 | 147.51 |
| 5 | 8 | 13.82 |
# Total alcohol per quality score, with 'quality' restored as a column.
df.groupby('quality')['alcohol'].sum().reset_index()
| quality | alcohol | |
|---|---|---|
| 0 | 3 | 99.550000 |
| 1 | 4 | 544.050000 |
| 2 | 5 | 6741.700000 |
| 3 | 6 | 6781.633333 |
| 4 | 7 | 2281.716667 |
| 5 | 8 | 217.700000 |
Categorical plots
# Categorical plot (catplot's default kind, a strip plot) of residual sugar
# for each quality score.
sns.catplot(x='quality',y='residual sugar',data=df)
<seaborn.axisgrid.FacetGrid at 0x2a593b013d0>
We cannot use a swarm plot for this dataset because all the variables are continuous and only the quality variable is discrete.
# Boxen (letter-value) plot of chlorides for each quality score.
sns.boxenplot(x="quality",y="chlorides",data=df)
<AxesSubplot:xlabel='quality', ylabel='chlorides'>
# Box plot of density for each quality score.
sns.boxplot(x="quality",y="density",data=df)
<AxesSubplot:xlabel='quality', ylabel='density'>
# Violin plot of alcohol for each quality score.
sns.violinplot(x ="quality", y = "alcohol", data = df)
<AxesSubplot:xlabel='quality', ylabel='alcohol'>
# Violin plot of free sulfur dioxide per quality score with a pastel palette;
# bw=.25 sets the KDE bandwidth and inner="stick" draws each observation as a line.
# NOTE(review): split=True is passed without a `hue` variable; how it is
# handled differs between seaborn versions - confirm the plot is as intended.
sns.violinplot(x="quality", y="free sulfur dioxide", bw=.25, split=True, palette= "pastel", inner= "stick", data=df)
<AxesSubplot:xlabel='quality', ylabel='free sulfur dioxide'>
# Bar plot of total sulfur dioxide per quality score (seaborn's default
# estimator, the mean), coloured with the "ch:.25" cubehelix palette.
sns.barplot(x="quality",y="total sulfur dioxide",palette = "ch:.25",data=df)
<AxesSubplot:xlabel='quality', ylabel='total sulfur dioxide'>
# Point plot of alcohol for each quality score.
sns.pointplot(x = "quality", y = "alcohol", data= df)
<AxesSubplot:xlabel='quality', ylabel='alcohol'>
Additional plots that were not done in class
# Kernel density estimate (KDE) of the quality scores.
sns.kdeplot(data=df, x="quality", multiple="layer")
<AxesSubplot:xlabel='quality', ylabel='Density'>
# Histogram of the quality scores with black bar edges.
df.quality.plot(kind='hist',edgecolor='black')
<AxesSubplot:ylabel='Frequency'>
# Grid of pairwise plots for the columns of df, coloured by quality score.
sns.pairplot(data=df, hue="quality")
<seaborn.axisgrid.PairGrid at 0x2a593e36df0>
# Strip plot of chlorides for each quality score.
sns.stripplot(x="quality", y="chlorides", data=df)
<AxesSubplot:xlabel='quality', ylabel='chlorides'>
# Scatter plot of density against pH ...
sns.scatterplot(data=df, x="pH", y="density")
# ... followed by a rug plot of the same two variables.
sns.rugplot(data=df, x="pH", y="density")
<AxesSubplot:xlabel='pH', ylabel='density'>
# Empirical cumulative distribution of quality, with quality on the y-axis.
sns.ecdfplot(data=df, y="quality")
<AxesSubplot:xlabel='Proportion', ylabel='quality'>
import sweetviz as sv
# Analyse df with sweetviz and save the report as an HTML file.
sweet_report = sv.analyze(df)
sweet_report.show_html('sweetviz_report.html')
Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
# Run AutoViz directly on the CSV file (it loads the data itself).
# NOTE(review): this rebinds `df` to whatever AutoViz returns, replacing the
# frame loaded with pd.read_csv - confirm later cells expect that object.
df = AV.AutoViz('winequality-red.csv')
Shape of your Data Set loaded: (1599, 12)
#######################################################################################
######################## C L A S S I F Y I N G V A R I A B L E S ####################
#######################################################################################
Classifying variables in data set...
12 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Number of All Scatter Plots = 66
Time to run AutoViz = 6 seconds
###################### AUTO VISUALIZATION Completed ########################